#Continuation of Exploratory Data Analysis Here I will continue the exploratory data analysis using R.
#Load dataframes manipulated from Jupyter Notebook
avg_genes=read.csv('/Users/danielaquijano/Documents/GitHub/Machine-Learning-Course-Projects/Final_Project_Neurotoxicity_Prediction/Final_Project_Files/avg_genes.csv')
avg_genes_nontoxic=read.csv('/Users/danielaquijano/Documents/GitHub/Machine-Learning-Course-Projects/Final_Project_Neurotoxicity_Prediction/Final_Project_Files/avg_genes_nontoxic.csv')
avg_genes_toxic=read.csv('/Users/danielaquijano/Documents/GitHub/Machine-Learning-Course-Projects/Final_Project_Neurotoxicity_Prediction/Final_Project_Files/avg_genes_toxic.csv')
Day_16_RNA_seq=read.csv('/Users/danielaquijano/Documents/GitHub/Machine-Learning-Course-Projects/Final_Project_Neurotoxicity_Prediction/Final_Project_Files/Day_16_RNA_seq.csv')
avg_genes
avg_genes$X <- NULL
avg_genes
#Install Relevant Packages
install.packages("devtools")
install.packages('dendextend')
install.packages('dendextendRcpp')
BiocManager::install("umap", update = FALSE)
#Load Relevant Libraries
library(pheatmap)#plotting heatmaps
library(grid)#plotting
library(gplots) #plotting
library(dendextend) #For dendogram visualization
library(DESeq2)#RNA Seq Analysis
library(umap)
library(dplyr)#For manipulation of dataframes
library(tidyverse) #PCA Analysis
avg_genes_correlation<-cor(avg_genes,method = "spearman")
pheatmap(avg_genes_correlation,color=rainbow(50),fontsize_row=4, fontsize_col=4,border_color=NA)
heatmap.2(as.matrix(avg_genes_correlation),
main = "Heatmap for the Average gene count on Day 16 and 21 after chemical exposure",cexRow=0.5
)
iris_species <- rev(levels(iris[,5]))
# order it the closest we can to the order of the observations:
dend <- rotate(dend, 1:150)
# Color the branches based on the clusters:
dend <- color_branches(dend, k=3) #, groupLabels=iris_species)
# Manually match the labels, as much as possible, to the real classification of the flowers:
labels_colors(dend) <-
rainbow_hcl(3)[sort_levels_values(
as.numeric(iris[,5])[order.dendrogram(dend)]
)]
# We shall add the flower type to the labels:
labels(dend) <- paste(as.character(iris[,5])[order.dendrogram(dend)],
"(",labels(dend),")",
sep = "")
# We hang the dendrogram a bit:
dend <- hang.dendrogram(dend,hang_height=0.1)
# reduce the size of the labels:
# dend <- assign_values_to_leaves_nodePar(dend, 0.5, "lab.cex")
dend <- set(dend, "labels_cex", 0.5)
# And plot:
par(mar = c(3,3,3,7))
plot(dend,
main = "Clustered Iris data set
(the labels give the true flower species)",
horiz = TRUE, nodePar = list(cex = .007))
legend("topleft", legend = iris_species, fill = rainbow_hcl(3))
#Differential Gene Expression Analysis using DESeq2 Here I will be using the DESeq2 package from bioconductor in order to conduct differential gene expression analysis. I will begin the analysis with some additional exploratory data asnalysis including Principal Component Analysis (PCA).
expgroup<-data_frame()
condition_toxic<-rep('Toxic',68,)
condition_nontoxic<-rep('Nontoxic',72)
condition<-c(condition_nontoxic,condition_toxic)
samples<-colnames(avg_genes)
expgroup<-cbind(condition)
rownames(expgroup)<-samples
expgroup
condition
b1a "Nontoxic"
b1b "Nontoxic"
b2a "Nontoxic"
b2b "Nontoxic"
b3a "Nontoxic"
b3b "Nontoxic"
b4a "Nontoxic"
b4b "Nontoxic"
b5a "Nontoxic"
b5b "Nontoxic"
b6a "Nontoxic"
b6b "Nontoxic"
b7a "Nontoxic"
b7b "Nontoxic"
b8a "Nontoxic"
b8b "Nontoxic"
b9a "Nontoxic"
b9b "Nontoxic"
b10a "Nontoxic"
b10b "Nontoxic"
c1a "Nontoxic"
c1b "Nontoxic"
c2a "Nontoxic"
c2b "Nontoxic"
c3a "Nontoxic"
c3b "Nontoxic"
c4a "Nontoxic"
c4b "Nontoxic"
c5a "Nontoxic"
c5b "Nontoxic"
c6a "Nontoxic"
c6b "Nontoxic"
c7a "Nontoxic"
c7b "Nontoxic"
c8a "Nontoxic"
c8b "Nontoxic"
c9a "Nontoxic"
c9b "Nontoxic"
c10a "Nontoxic"
c10b "Nontoxic"
c11a "Nontoxic"
c11b "Nontoxic"
c12a "Nontoxic"
c12b "Nontoxic"
c13a "Nontoxic"
c13b "Nontoxic"
c14a "Nontoxic"
c14b "Nontoxic"
c15a "Nontoxic"
c15b "Nontoxic"
c16a "Nontoxic"
c16b "Nontoxic"
c17a "Nontoxic"
c17b "Nontoxic"
c18a "Nontoxic"
c18b "Nontoxic"
c19a "Nontoxic"
c19b "Nontoxic"
c20a "Nontoxic"
c20b "Nontoxic"
c21a "Nontoxic"
c21b "Nontoxic"
c22a "Nontoxic"
c22b "Nontoxic"
c23a "Nontoxic"
c23b "Nontoxic"
c24a "Nontoxic"
c24b "Nontoxic"
c25a "Nontoxic"
c25b "Nontoxic"
c26a "Nontoxic"
c26b "Nontoxic"
t1a "Toxic"
t1b "Toxic"
t2a "Toxic"
t2b "Toxic"
t3a "Toxic"
t3b "Toxic"
t4a "Toxic"
t4b "Toxic"
t5a "Toxic"
t5b "Toxic"
t6a "Toxic"
t6b "Toxic"
t7a "Toxic"
t7b "Toxic"
t8a "Toxic"
t8b "Toxic"
t9a "Toxic"
t9b "Toxic"
t10a "Toxic"
t10b "Toxic"
t11a "Toxic"
t11b "Toxic"
t12a "Toxic"
t12b "Toxic"
t13a "Toxic"
t13b "Toxic"
t14a "Toxic"
t14b "Toxic"
t15a "Toxic"
t15b "Toxic"
t16a "Toxic"
t16b "Toxic"
t17a "Toxic"
t17b "Toxic"
t18a "Toxic"
t18b "Toxic"
t19a "Toxic"
t19b "Toxic"
t20a "Toxic"
t20b "Toxic"
t21a "Toxic"
t21b "Toxic"
t22a "Toxic"
t22b "Toxic"
t23a "Toxic"
t23b "Toxic"
t24a "Toxic"
t24b "Toxic"
t25a "Toxic"
t25b "Toxic"
t26a "Toxic"
t26b "Toxic"
t27a "Toxic"
t27b "Toxic"
t28a "Toxic"
t28b "Toxic"
t29a "Toxic"
t29b "Toxic"
t30a "Toxic"
t30b "Toxic"
t31a "Toxic"
t31b "Toxic"
t32a "Toxic"
t32b "Toxic"
t33a "Toxic"
t33b "Toxic"
t34a "Toxic"
t34b "Toxic"
UMAP steps: https://alexslemonade.github.io/refinebio-examples/03-rnaseq/dimension-reduction_rnaseq_02_umap.html
##Principal Component Analysis (PCA)
#Principal component analysis for data before differential gene expression analysis
#We expect data to cluster depending on whether the neural construct was exposed to a toxic chemical or not.
pca_matrix_avg_genes <- as.matrix(t(avg_genes))
sample_pca <- prcomp(pca_matrix_avg_genes)
as_tibble(pca_matrix_avg_genes)# Convert matrix to tibble (a tibble is a tidyverse version of a dataframe)